'''
process_diskfile

The first step extracted the web pages we care about to disk.
This second step processes those files on disk, extracts the useful
content, and inserts it into the database. The file names must also be
parsed, because they are needed to tell the page types apart.
'''
import json
import os
import os.path
import re
import sys

import parse_insert

def walkDir(dir_name):
    """Process every file directly inside *dir_name*.

    Builds a single parser (sharing one database connection) and hands
    each directory entry to processFile().

    Parameters:
        dir_name: directory holding the previously downloaded pages
                  (e.g. 'pages/').
    """
    # NOTE(review): original code called a bare `Parse()`, which is not
    # defined in this file; the class presumably lives in parse_insert --
    # confirm against that module.
    parse = parse_insert.Parse()
    parse.setDatabaseConnector('10.103.21.244', 'root', '1', 'weibo')
    try:
        ls = os.listdir(dir_name)
    except OSError:
        # Directory missing or unreadable: report and give up, rather
        # than swallowing every possible exception with a bare except.
        print('access deny')
    else:
        for f in ls:
            processFile(dir_name, f, parse)

def processFile(dir_name, f, parse):
    """Parse one saved page file and insert its content via *parse*.

    The first 10 characters of the file name are taken as the user id.
    The name also indicates the page type: 'follow...' names are
    follower pages, 'friend...' names are friend pages.

    Parameters:
        dir_name: directory the file lives in.
        f:        file name (starts with a 10-character uid).
        parse:    parser object with setFollowerFlag/setHtml/
                  parseHtmlAndInsert methods.
    """
    uid = f[:10]  # file names start with the user id
    # Plain substring tests replace the original regex findall calls;
    # `'x' in f` is equivalent to len(re.findall('x', f)) > 0 here.
    has_friend = 'friend' in f
    has_follow = 'follow' in f
    if not has_friend and not has_follow:
        # Neither marker present: leave the parser's follower flag as-is.
        # (Original used `else if`, a SyntaxError in Python -- fixed to elif.)
        pass
    elif not has_friend:
        # Only 'follow' matched: this is a followers page.
        parse.setFollowerFlag(True)
    else:
        parse.setFollowerFlag(False)
    for html in getHtmls(dir_name, f):
        parse.setHtml(html)
        parse.parseHtmlAndInsert(uid)

def filterJson(json_obj):
    """Return True when the pagelet JSON object carries an 'html' payload.

    Bug fix: the original fell off the end (returning None, which is
    falsy) on the success path too, so the caller in getHtmls skipped
    every pagelet.

    Parameters:
        json_obj: decoded pagelet dict; must contain an 'html' key
                  (raises KeyError otherwise, as before).
    """
    return json_obj['html'] is not None

def getHtmls(dir_name, f):
    """Extract the embedded pagelet HTML fragments from one saved page.

    Weibo pages embed their content as JSON arguments to
    `STK.pageletM.view(...)` calls; each match is decoded and its 'html'
    field collected.

    Parameters:
        dir_name: directory the file lives in.
        f:        file name within that directory.
    Returns:
        List of HTML fragment strings (empty if nothing matched).
    """
    htmls = []
    # os.path.join replaces fragile string concatenation (still works
    # with the trailing-slash 'pages/' the caller passes).
    path = os.path.join(dir_name, f)
    # Raw string so the backslash escapes reach the regex engine intact.
    regex = re.compile(r'STK && STK\.pageletM && STK\.pageletM\.view\((.*)\)')
    # `with` guarantees the file is closed even if decoding raises.
    with open(path, 'r') as file_obj:
        page = file_obj.read()
    for mat in regex.finditer(page):
        # stdlib json replaces `simplejson`, which was never imported.
        json_obj = json.loads(mat.group(1))
        # Fixed typo: original called undefined `filerJson` (NameError).
        if filterJson(json_obj):
            htmls.append(json_obj['html'])
    return htmls


if __name__ == "__main__":
    # Script entry point: process every page file saved under pages/.
    # Fixed two typos: `__main__ =` -> `__name__ ==`, and `workDir`
    # (undefined) -> `walkDir`.
    walkDir('pages/')
